To illustrate how to use pyLDAvis's gensim helper functions, we will create a model from the 20 Newsgroups corpus. Only minimal preprocessing is done, so the model is far from optimal; the goal of this notebook is simply to demonstrate the helper functions.
In [1]:
%%bash
mkdir -p data
pushd data
if [ -d "20news-bydate-train" ]
then
echo "The data has already been downloaded..."
else
wget http://qwone.com/%7Ejason/20Newsgroups/20news-bydate.tar.gz
tar xfv 20news-bydate.tar.gz
rm 20news-bydate.tar.gz
fi
echo "Lets take a look at the groups..."
ls 20news-bydate-train/
popd
Each group directory contains a set of files:
In [2]:
ls -lah data/20news-bydate-train/sci.space | tail -n 5
Let's take a peek at one email:
In [3]:
!head -n 20 data/20news-bydate-train/sci.space/61422
In [4]:
from glob import glob
import re
import string
import funcy as fp
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
import nltk
import pandas as pd
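The stopword list used below comes from NLTK. If you have never fetched the stopwords corpus on this machine, a one-time download is needed first (a small setup step, not part of the original notebook):

# one-time download of NLTK's stopword list (a no-op if already present)
import nltk
nltk.download('stopwords')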
In [5]:
# quick and dirty....
EMAIL_REGEX = re.compile(r"[a-z0-9\.\+_-]+@[a-z0-9\._-]+\.[a-z]*")
FILTER_REGEX = re.compile(r"[^a-z '#]")
TOKEN_MAPPINGS = [(EMAIL_REGEX, "#email"), (FILTER_REGEX, ' ')]

def tokenize_line(line):
    # lowercase, collapse email addresses to the '#email' token, and blank
    # out everything that is not a letter, space, apostrophe, or '#'
    res = line.lower()
    for regexp, replacement in TOKEN_MAPPINGS:
        res = regexp.sub(replacement, res)
    return res.split()

def tokenize(lines, token_size_filter=2):
    # tokenize every line and drop very short tokens
    tokens = fp.mapcat(tokenize_line, lines)
    return [t for t in tokens if len(t) > token_size_filter]

def load_doc(filename):
    group, doc_id = filename.split('/')[-2:]
    # some messages contain non-UTF-8 bytes, so ignore decoding errors
    with open(filename, errors='ignore') as f:
        doc = f.readlines()
    return {'group': group,
            'doc': doc,
            'tokens': tokenize(doc),
            'id': doc_id}

docs = pd.DataFrame(list(map(load_doc, glob('data/20news-bydate-train/*/*')))).set_index(['group', 'id'])
docs.head()
Out[5]:
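As a quick sanity check (not part of the original notebook), you can run the tokenizer on a made-up line; the email address becomes the '#email' token and tokens of two characters or fewer are dropped:

tokenize(["Please email me at jane.doe@example.com about the #launch!"])
# -> ['please', 'email', '#email', 'about', 'the', '#launch']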
In [6]:
def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords; words absent from the dictionary map to None, which
    # never matches a real token id, so they are harmless here
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # drop tokens appearing in fewer than no_below documents or in more
    # than no_above (a fraction) of all documents
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
In [7]:
dictionary, corpus = prep_corpus(docs['tokens'])
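A quick look at what came back (again a sanity check, not in the original): the dictionary reports its vocabulary size, and each corpus entry is a bag-of-words list of (token_id, count) pairs:

print(dictionary)     # e.g. Dictionary(... unique tokens: [...])
print(len(corpus))    # number of documents
print(corpus[0][:5])  # first few (token_id, count) pairs of the first doc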
In [8]:
MmCorpus.serialize('newsgroups.mm', corpus)
dictionary.save('newsgroups.dict')
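Serializing these means a later session can skip the preprocessing entirely; a minimal sketch of loading the artifacts back (not executed here):

# reload the bag-of-words corpus and dictionary from disk
corpus = MmCorpus('newsgroups.mm')
dictionary = Dictionary.load('newsgroups.dict')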
In [9]:
%%time
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10)
lda.save('newsgroups_50.model')
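Training takes a while, so the model is saved too. If you come back later, you can reload it and peek at a few topics instead of retraining (a sketch, not executed in this notebook):

# reload the trained model and print a few topics as word/weight strings
lda = models.ldamodel.LdaModel.load('newsgroups_50.model')
for topic in lda.show_topics(num_topics=5, num_words=8):
    print(topic)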
In [10]:
# note: newer pyLDAvis releases renamed this module to pyLDAvis.gensim_models
import pyLDAvis.gensim as gensimvis
import pyLDAvis
In [11]:
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)
Out[11]:
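The prepared data is a plain Python object, so the visualization can also be written to a standalone HTML file and shared without a running notebook (the filename here is just an example):

# write the interactive visualization to a self-contained HTML file
pyLDAvis.save_html(vis_data, 'newsgroups_ldavis.html')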